# Clears work environment
rm(list = ls())

# Sets working directory
setwd('C:/Users/Jason/Box Sync/Home Folder jdt34/733 - Maximum Likelihood Estimation/Lai & Slater 2006 replication')

# Loads data and libraries
load(file='cleanedLSdata.RData')
loadPkg(packs)

# Defines function to perform non-random 5-fold cross-validation, folded by decades and 
# random folds with the negative binomial model
lscv = function(data, formula, ivs, IVs, dv) {
  RMSE = function(mod){sqrt(mean(residuals(mod, type='response')^2))}
  original = glm.nb(formula, data, control=glm.control(maxit=100))
  data$decade = NA
  data$decade[which(data$year<1960)] = 5
  data$decade[which(data$year>1959 & data$year<1970)] = 6
  data$decade[which(data$year>1969 & data$year<1980)] = 7
  data$decade[which(data$year>1979 & data$year<1990)] = 8
  data$decade[which(data$year>1989 & data$year<1993)] = 9
  rmse.decade = rmse.random = RMSE(original)
  model.decade = model.random = list()
  ci.decade = ci.random = list()
  betas.decade = betas.random = cbind(data.frame(t(apply(negbdraws, 2, FUN=cimaker)))[2:10,], fold=0)
  source(paste0(getwd(), '/sndwch.R'))
  origVCov = sndwch(model=original, family='Negative Binomial', 
                    rank=original$rank, id=data$ccode, small.n=F)
  coefs.reduce = cbind(coef(original), sqrt(diag(origVCov)), fold=0)
  coefs.decade = coefs.random = coefs.reduce[2:dim(coefs.reduce)[1],]
  for(i in 1:5) {
    set.seed(6886)
    dec = i + 4
    train = data[which(data$decade!=dec),]
    test = data[which(data$decade==dec),]
    trainModel = glm.nb(formula, train, control=glm.control(maxit=100))
    model.decade[[i]] = trainModel
    trainVCov = sndwch(model=trainModel, family='Negative Binomial', 
                       rank=trainModel$rank, id=train$ccode, small.n=F)    
    trainResults = cbind(coef(trainModel), sqrt(diag(trainVCov)), fold=dec)
    trainResults = trainResults[2:dim(trainResults)[1],]
    coefs.decade = rbind(coefs.decade, trainResults)
    testPreds = exp(data.matrix(cbind(1, test[,ivs])) %*% coef(trainModel))
    rmse.decade = c(rmse.decade, sqrt(mean((test[,dv] - testPreds)^2)))
    rdraw = mvrnorm(sims, coef(trainModel), trainVCov)
    pdict = exp(data.matrix(cbind(1, test[,ivs])) %*% t(rdraw))
    quant = data.frame(t(apply(pdict, 1, FUN=cimaker)))
    colnames(quant) = c('low95', 'low90', 'est', 'upp90', 'upp95')
    ci.decade[[i]] = quant
    betas.decade = rbind(betas.decade, cbind(data.frame(t(apply(rdraw, 2, FUN=cimaker)))[2:10,], fold=dec))
  }
  colnames(betas.decade) = c('low95', 'low90', 'est', 'upp90', 'upp95', 'fold')
  betas.decade$IV = factor(IVs, levels=IVs)
  tmp.decade.label = paste(c('Full:', "1950's:", "1960's:", "1970's:", "1980's:", "1990's:"), 
                           as.character(round(rmse.decade, 2)), ' ')
  tmp.decade = ggplot(betas.decade, aes(x=factor(fold), y=est, ymin=low95, ymax=upp95, color=factor(fold))) +
    geom_linerange(size=.4) +
    geom_linerange(size=.8, aes(ymin=low90, ymax=upp90)) +
    geom_errorbar(width=.25) +
    geom_point(size=3, shape=18) +
    facet_wrap(~IV, scales='free_y') +
    geom_hline(aes(y=0), color='grey50', linetype=2) +
    scale_color_manual(values=c('black', hcl(h=seq(15, 375-360/5, length=5)%%360, c=100, l=65)), 
                       name='RMSE \nby fold', labels=tmp.decade.label) +
    scale_linetype_manual(values=c(1,2)) +
    theme(axis.text.x=element_blank(),
          axis.text.y=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.ticks.x=element_blank(),
          axis.ticks.y=element_blank(),
          panel.grid.minor.y=element_blank(),
          panel.grid.major.x=element_blank(),
          legend.position='bottom', 
          legend.key=element_rect(fill=NA),
          legend.justification=c(1, 0)) + 
    guides(color=guide_legend(nrow=2, byrow=T))  
  data$rand = sample(1:5, nrow(data), replace=T)
  for(i in 1:5) {
    train = data[which(data$rand!=i),]
    test = data[which(data$rand==i),]
    trainModel = glm.nb(formula, train, control=glm.control(maxit=100))
    model.random[[i]] = trainModel
    trainVCov = sndwch(model=trainModel, family='Negative Binomial', 
                       rank=trainModel$rank, id=train$ccode, small.n=F)    
    trainResults = cbind(coef(trainModel), sqrt(diag(trainVCov)), fold=i)
    trainResults = trainResults[2:dim(trainResults)[1],]
    coefs.random = rbind(coefs.random, trainResults)
    testPreds = exp(data.matrix(cbind(1, test[,ivs])) %*% coef(trainModel))
    rmse.random = c(rmse.random, sqrt(mean((test[,dv] - testPreds)^2)))
    rdraw = mvrnorm(sims, coef(trainModel), trainVCov)
    pdict = exp(data.matrix(cbind(1, test[,ivs])) %*% t(rdraw))
    quant = data.frame(t(apply(pdict, 1, FUN=cimaker)))
    colnames(quant) = c('low95', 'low90', 'est', 'upp90', 'upp95')
    ci.random[[i]] = quant
    betas.random = rbind(betas.random, cbind(data.frame(t(apply(rdraw, 2, FUN=cimaker)))[2:10,], fold=i))
  }
  colnames(betas.random) = c('low95', 'low90', 'est', 'upp90', 'upp95', 'fold')
  betas.random$IV = factor(IVs, levels=IVs)
  tmp.random.label = paste(c('Full:', "A:", "B:", "C:", "D:", "E:"), 
                           as.character(round(rmse.random, 2)), c('  ', rep('     ', 5)))
  tmp.random = ggplot(betas.random, aes(x=factor(fold), y=est, ymin=low95, ymax=upp95, color=factor(fold))) +
    geom_linerange(size=.4) +
    geom_linerange(size=.8, aes(ymin=low90, ymax=upp90)) +
    geom_errorbar(width=.25) +
    geom_point(size=3, shape=18) +
    facet_wrap(~IV, scales='free_y') +
    geom_hline(aes(y=0), color='grey50', linetype=2) +
    scale_color_manual(values=c('black', hcl(h=seq(15, 375-360/5, length=5)%%360, c=100, l=65)), 
                       name='RMSE \nby fold', labels=tmp.random.label) +
    scale_linetype_manual(values=c(1,2)) +
    theme(axis.text.x=element_blank(),
          axis.text.y=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.ticks.x=element_blank(),
          axis.ticks.y=element_blank(),
          panel.grid.minor.y=element_blank(),
          panel.grid.major.x=element_blank(),
          legend.position='bottom', 
          legend.key=element_rect(fill=NA),
          legend.justification=c(1, 0)) + 
    guides(color=guide_legend(nrow=2, byrow=T))  
  output = list()
  output[[1]] = tmp.decade
  output[[2]] = tmp.random
  return(output)
}
lsmodel = lscv(lsm, form, ivs, IVs, dv)
ggsave(file='nb_cv_decade.pdf', lsmodel[[1]], width=6, height=7, units='in')  
ggsave(file='nb_cv_random.pdf', lsmodel[[2]], width=6, height=7, units='in')  
nbcvs = arrangeGrob(lsmodel[[1]], lsmodel[[2]], ncol=2)

# Defines function to perform non-random 5-fold cross-validation, folded by decades and 
# random folds with the zero-inflated negative binomial model
mycv = function(data, formula, ivs, IVs, dv) {
  RMSE = function(mod){sqrt(mean(mod$residuals^2))}
  original = zeroinfl(formula, data, dist='negbin')
  data$decade = NA
  data$decade[which(data$year<1960)] = 5
  data$decade[which(data$year>1959 & data$year<1970)] = 6
  data$decade[which(data$year>1969 & data$year<1980)] = 7
  data$decade[which(data$year>1979 & data$year<1990)] = 8
  data$decade[which(data$year>1989 & data$year<1993)] = 9
  rmse.decade = rmse.random = RMSE(original)
  model.decade = model.random = list()
  ci.decade = ci.random = list()
  betas.decade = betas.random = cbind(data.frame(t(apply(zinbdraws, 2, FUN=cimaker)))[c(2:10, 12:20),], fold=0)
  source(paste0(getwd(), '/clrobustse.R'))
  origVCov = clrobustse(original, data$ccode)$vcov
  coefs.reduce = cbind(coef(original), sqrt(diag(origVCov)), fold=0)
  coefs.decade = coefs.random = coefs.reduce[c(2:10, 12:20),]
  for(i in 1:5) {
    set.seed(6886)
    dec = i + 4
    train = data[which(data$decade!=dec),]
    test = data[which(data$decade==dec),]
    trainModel = zeroinfl(formula, train, dist='negbin')
    model.decade[[i]] = trainModel
    trainVCov = clrobustse(trainModel, train$ccode)$vcov
    trainResults = cbind(coef(trainModel), sqrt(diag(trainVCov)), fold=dec)
    trainResults = trainResults[c(2:10, 12:20),]
    coefs.decade = rbind(coefs.decade, trainResults)    
    testCount = exp(data.matrix(cbind(1, test[,ivs])) %*% coef(trainModel)[1:10])
    testZeros = 1 / (1 + exp(-data.matrix(cbind(1, test[,ivs])) %*% coef(trainModel)[11:20]))
    testPreds = (1 - testZeros) * testCount
    rmse.decade = c(rmse.decade, sqrt(mean((test[,dv] - testPreds)^2)))
    rdraw = mvrnorm(sims, coef(trainModel), trainVCov)
    pdictC = exp(data.matrix(cbind(1, test[,ivs])) %*% t(rdraw[,1:10]))
    pdictZ = 1 / (1 + exp(-data.matrix(cbind(1, test[,ivs])) %*% t(rdraw[,11:20])))
    pdict = (1 - pdictZ) * pdictC
    quant = data.frame(t(apply(pdict, 1, FUN=cimaker)))
    colnames(quant) = c('low95', 'low90', 'est', 'upp90', 'upp95')
    ci.decade[[i]] = quant
    betas.decade = rbind(betas.decade, cbind(data.frame(t(apply(rdraw, 2, FUN=cimaker)))[c(2:10, 12:20),], fold=dec))
  }
  colnames(betas.decade) = c('low95', 'low90', 'est', 'upp90', 'upp95', 'fold')
  betas.decade$IV = factor(paste(IVs, c(rep('| count', 9), rep('| zero', 9))), 
                           levels=paste(IVs, c(rep('| count', 9), rep('| zero', 9)))[c(1:3, 10:12, 4:6, 13:15, 7:9, 16:18)])
  tmp.decade.label = paste(c('Full:', "1950's:", "1960's:", "1970's:", "1980's:", "1990's:"), 
                           as.character(round(rmse.decade, 2)), ' ')
  tmp.decade = ggplot(betas.decade, aes(x=factor(fold), y=est, ymin=low95, ymax=upp95, color=factor(fold))) +
    geom_linerange(size=.4) +
    geom_linerange(size=.8, aes(ymin=low90, ymax=upp90)) +
    geom_errorbar(width=.25) +
    geom_point(size=3, shape=18) +
    facet_wrap(~IV, scales='free_y', nrow=3) +
    geom_hline(aes(y=0), color='grey50', linetype=2) +
    scale_color_manual(values=c('black', hcl(h=seq(15, 375-360/5, length=5)%%360, c=100, l=65)), 
                       name='RMSE by fold', labels=tmp.decade.label) +
    scale_linetype_manual(values=c(1,2)) +
    theme(axis.text.x=element_blank(),
          axis.text.y=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.ticks.x=element_blank(),
          axis.ticks.y=element_blank(),
          panel.grid.minor.y=element_blank(),
          panel.grid.major.x=element_blank(),
          legend.position='bottom', 
          legend.key=element_rect(fill=NA),
          legend.justification=c(1, 0))
  data$rand = sample(1:5, nrow(data), replace=T)
  for(i in 1:5) {
    train = data[which(data$rand!=i),]
    test = data[which(data$rand==i),]
    trainModel = zeroinfl(formula, train, dist='negbin')
    model.random[[i]] = trainModel
    trainVCov = clrobustse(trainModel, train$ccode)$vcov  
    trainResults = cbind(coef(trainModel), sqrt(diag(trainVCov)), fold=i)
    trainResults = trainResults[c(2:10, 12:20),]
    coefs.random = rbind(coefs.random, trainResults)
    testCount = exp(data.matrix(cbind(1, test[,ivs])) %*% coef(trainModel)[1:10])
    testZeros = 1 / (1 + exp(-data.matrix(cbind(1, test[,ivs])) %*% coef(trainModel)[11:20]))
    testPreds = (1 - testZeros) * testCount
    rmse.random = c(rmse.random, sqrt(mean((test[,dv] - testPreds)^2)))
    rdraw = mvrnorm(sims, coef(trainModel), trainVCov)
    pdictC = exp(data.matrix(cbind(1, test[,ivs])) %*% t(rdraw[,1:10]))
    pdictZ = 1 / (1 + exp(-data.matrix(cbind(1, test[,ivs])) %*% t(rdraw[,11:20])))
    pdict = (1 - pdictZ) * pdictC
    quant = data.frame(t(apply(pdict, 1, FUN=cimaker)))
    colnames(quant) = c('low95', 'low90', 'est', 'upp90', 'upp95')
    ci.random[[i]] = quant
    betas.random = rbind(betas.random, cbind(data.frame(t(apply(rdraw, 2, FUN=cimaker)))[c(2:10, 12:20),], fold=i))
  }
  colnames(betas.random) = c('low95', 'low90', 'est', 'upp90', 'upp95', 'fold')
  betas.random$IV = factor(paste(IVs, c(rep('| count', 9), rep('| zero', 9))), 
                           levels=paste(IVs, c(rep('| count', 9), rep('| zero', 9)))[c(1:3, 10:12, 4:6, 13:15, 7:9, 16:18)])
  tmp.random.label = paste(c('Full:', "A:", "B:", "C:", "D:", "E:"), 
                           as.character(round(rmse.random, 2)), c('  ', rep('     ', 5)))
  tmp.random = ggplot(betas.random, aes(x=factor(fold), y=est, ymin=low95, ymax=upp95, color=factor(fold))) +
    geom_linerange(size=.4) +
    geom_linerange(size=.8, aes(ymin=low90, ymax=upp90)) +
    geom_errorbar(width=.25) +
    geom_point(size=3, shape=18) +
    facet_wrap(~IV, scales='free_y', nrow=3) +
    geom_hline(aes(y=0), color='grey50', linetype=2) +
    scale_color_manual(values=c('black', hcl(h=seq(15, 375-360/5, length=5)%%360, c=100, l=65)), 
                       name='RMSE by fold', labels=tmp.random.label) +
    scale_linetype_manual(values=c(1,2)) +
    theme(axis.text.x=element_blank(),
          axis.text.y=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.ticks.x=element_blank(),
          axis.ticks.y=element_blank(),
          panel.grid.minor.y=element_blank(),
          panel.grid.major.x=element_blank(),
          legend.position='bottom', 
          legend.key=element_rect(fill=NA),
          legend.justification=c(1, 0))  
  output = list()
  output[[1]] = tmp.decade
  output[[2]] = tmp.random
  return(output)
}
mymodel = mycv(lsm, form, ivs, IVs, dv)
ggsave(file='zi_cv_decade.pdf', mymodel[[1]], width=12, height=7, units='in')  
ggsave(file='zi_cv_random.pdf', mymodel[[2]], width=12, height=7, units='in')  

allcvs = arrangeGrob(nbcvs, mymodel[[1]], mymodel[[2]], ncol=1)
ggsave(file='all_cv.pdf', allcvs, width=9, height=16, units='in')

save(file='panel5_cross_validation.RData', lscv, lsmodel, mycv, mymodel, nbcvs, allcvs)
